@aiello/wechat-to-markdown 1.2.12 → 1.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +79 -18
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +10 -16
- package/dist/index.js +75 -17
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -27,13 +27,14 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
27
27
|
var src_exports = {};
|
|
28
28
|
__export(src_exports, {
|
|
29
29
|
Status: () => Status,
|
|
30
|
-
default: () => transformHtml2Markdown,
|
|
31
30
|
getTurnDownService: () => getTurnDownService,
|
|
32
|
-
|
|
31
|
+
isWechatPage: () => isWechatPage,
|
|
32
|
+
parseHTML: () => parseHTML,
|
|
33
|
+
transformHtml2Markdown: () => transformHtml2Markdown,
|
|
34
|
+
transformUrl2Markdown: () => transformUrl2Markdown
|
|
33
35
|
});
|
|
34
36
|
module.exports = __toCommonJS(src_exports);
|
|
35
37
|
var import_axios = __toESM(require("axios"), 1);
|
|
36
|
-
var import_cheerio2 = require("cheerio");
|
|
37
38
|
|
|
38
39
|
// src/error.ts
|
|
39
40
|
var errObj = {
|
|
@@ -47,6 +48,9 @@ var Status = /* @__PURE__ */ ((Status2) => {
|
|
|
47
48
|
return Status2;
|
|
48
49
|
})(Status || {});
|
|
49
50
|
|
|
51
|
+
// src/parsers/wechat.ts
|
|
52
|
+
var import_cheerio2 = require("cheerio");
|
|
53
|
+
|
|
50
54
|
// src/turndownCode.ts
|
|
51
55
|
var import_turndown = __toESM(require("turndown"), 1);
|
|
52
56
|
var import_turndown_plugin_gfm = __toESM(require("@guyplusplus/turndown-plugin-gfm"), 1);
|
|
@@ -150,19 +154,11 @@ function getTurnDownService(params) {
|
|
|
150
154
|
return turndownService;
|
|
151
155
|
}
|
|
152
156
|
|
|
153
|
-
// src/
|
|
154
|
-
|
|
155
|
-
return {
|
|
156
|
-
code,
|
|
157
|
-
success: false,
|
|
158
|
-
msg: errObj[code]
|
|
159
|
-
};
|
|
160
|
-
};
|
|
161
|
-
async function parseHTML(htmlRaw, meta) {
|
|
157
|
+
// src/parsers/wechat.ts
|
|
158
|
+
async function parseWeChatPage(htmlRaw, meta) {
|
|
162
159
|
var _a;
|
|
163
160
|
const $ = (0, import_cheerio2.load)(htmlRaw);
|
|
164
|
-
|
|
165
|
-
title = title.trim() || "";
|
|
161
|
+
const title = ($("#activity-name").text() || "").trim();
|
|
166
162
|
const author = Array.from(
|
|
167
163
|
new Set(
|
|
168
164
|
[
|
|
@@ -173,7 +169,36 @@ async function parseHTML(htmlRaw, meta) {
|
|
|
173
169
|
).join("\n");
|
|
174
170
|
const htmlEl = $("#js_content");
|
|
175
171
|
const html = htmlEl.html();
|
|
176
|
-
if (html
|
|
172
|
+
if (html == null ? void 0 : html.length) {
|
|
173
|
+
let res = getTurnDownService(meta).turndown(html);
|
|
174
|
+
res = `## ${title}
|
|
175
|
+
|
|
176
|
+
## \u4F5C\u8005 ${author}
|
|
177
|
+
|
|
178
|
+
` + res;
|
|
179
|
+
return {
|
|
180
|
+
success: true,
|
|
181
|
+
code: 200 /* Success */,
|
|
182
|
+
data: {
|
|
183
|
+
title,
|
|
184
|
+
author,
|
|
185
|
+
content: res
|
|
186
|
+
}
|
|
187
|
+
};
|
|
188
|
+
}
|
|
189
|
+
return null;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// src/parsers/general.ts
|
|
193
|
+
var import_cheerio3 = require("cheerio");
|
|
194
|
+
async function parseGeneralHTML(htmlRaw, meta) {
|
|
195
|
+
var _a;
|
|
196
|
+
const $ = (0, import_cheerio3.load)(htmlRaw);
|
|
197
|
+
$("script").remove();
|
|
198
|
+
const title = ($("title").text() || "").trim();
|
|
199
|
+
const author = (((_a = $('meta[name="author"]')) == null ? void 0 : _a.attr("content")) || "").trim();
|
|
200
|
+
const html = $("body").html();
|
|
201
|
+
if (html == null ? void 0 : html.length) {
|
|
177
202
|
let res = getTurnDownService(meta).turndown(html);
|
|
178
203
|
res = `## ${title}
|
|
179
204
|
|
|
@@ -190,9 +215,42 @@ async function parseHTML(htmlRaw, meta) {
|
|
|
190
215
|
}
|
|
191
216
|
};
|
|
192
217
|
}
|
|
218
|
+
return null;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// src/index.ts
|
|
222
|
+
var getError = (code) => {
|
|
223
|
+
return {
|
|
224
|
+
code,
|
|
225
|
+
success: false,
|
|
226
|
+
msg: errObj[code]
|
|
227
|
+
};
|
|
228
|
+
};
|
|
229
|
+
function isWechatPage(html) {
|
|
230
|
+
return html == null ? void 0 : html.includes("res.wx.qq.com");
|
|
231
|
+
}
|
|
232
|
+
async function parseHTML(html, meta) {
|
|
233
|
+
let result = null;
|
|
234
|
+
if (isWechatPage(html)) {
|
|
235
|
+
result = await parseWeChatPage(html, meta);
|
|
236
|
+
}
|
|
237
|
+
if (!result) {
|
|
238
|
+
result = await parseGeneralHTML(html, meta);
|
|
239
|
+
}
|
|
240
|
+
if (result) {
|
|
241
|
+
return result;
|
|
242
|
+
}
|
|
193
243
|
return getError(400 /* Fail */);
|
|
194
244
|
}
|
|
195
|
-
async function transformHtml2Markdown(
|
|
245
|
+
async function transformHtml2Markdown(html, url) {
|
|
246
|
+
try {
|
|
247
|
+
return parseHTML(html, { url });
|
|
248
|
+
} catch (err) {
|
|
249
|
+
console.log(err);
|
|
250
|
+
return getError(400 /* Fail */);
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
async function transformUrl2Markdown(url, options = {}) {
|
|
196
254
|
const { axiosConfig = {} } = options;
|
|
197
255
|
const { headers = {}, ...restConfig } = axiosConfig;
|
|
198
256
|
const u = new URL(url);
|
|
@@ -209,7 +267,7 @@ async function transformHtml2Markdown(url, options = {}) {
|
|
|
209
267
|
},
|
|
210
268
|
...restConfig
|
|
211
269
|
});
|
|
212
|
-
return
|
|
270
|
+
return transformHtml2Markdown(res.data, url);
|
|
213
271
|
} catch (err) {
|
|
214
272
|
console.log(err);
|
|
215
273
|
return getError(400 /* Fail */);
|
|
@@ -219,6 +277,9 @@ async function transformHtml2Markdown(url, options = {}) {
|
|
|
219
277
|
0 && (module.exports = {
|
|
220
278
|
Status,
|
|
221
279
|
getTurnDownService,
|
|
222
|
-
|
|
280
|
+
isWechatPage,
|
|
281
|
+
parseHTML,
|
|
282
|
+
transformHtml2Markdown,
|
|
283
|
+
transformUrl2Markdown
|
|
223
284
|
});
|
|
224
285
|
//# sourceMappingURL=index.cjs.map
|
package/dist/index.cjs.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/error.ts","../src/type.ts","../src/turndownCode.ts","../src/formatHtml.ts"],"sourcesContent":["import axios, { AxiosRequestConfig } from 'axios'\nimport { load } from 'cheerio'\nimport { errObj } from './error'\nimport type { TurnDownResult } from './type'\nimport { Status } from './type'\nimport { getTurnDownService } from './turndownCode'\n\nconst getError = (code: number) => {\n return {\n code,\n success: false,\n msg: errObj[code],\n }\n}\n\nexport { TurnDownResult, Status }\n\nexport async function parseHTML(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n\n let title = $('#activity-name').text()\n\n title = title.trim() || ''\n const author = Array.from(\n new Set(\n [\n $('meta[name=\"author\"]')?.attr('content'),\n ...$('#js_name').text().split('\\n'),\n ]\n .map((item) => (item ? item.trim() : ''))\n .filter(Boolean)\n )\n ).join('\\n')\n\n const htmlEl = $('#js_content')\n const html = htmlEl.html()\n\n if (html && html.length > 0) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return getError(Status.Fail)\n}\n\n/**\n * 支持添加代理服务器\n */\ninterface TransformHtml2MarkdownOptions {\n axiosConfig?: AxiosRequestConfig\n}\n\nexport default async function transformHtml2Markdown(\n url: string,\n options: TransformHtml2MarkdownOptions = {}\n): Promise<TurnDownResult> {\n const { axiosConfig = {} } = options\n const { headers = {}, ...restConfig } = axiosConfig\n\n const u = new URL(url)\n // 移除该参数\n // 避免出现 302 跳转\n u.searchParams.delete('poc_token')\n\n try {\n const res = await axios.get(u.href, {\n timeout: 30000,\n maxRedirects: 5,\n headers: {\n DNT: '1',\n 'Upgrade-Insecure-Requests': '1',\n 'User-Agent':\n 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',\n ...headers,\n },\n ...restConfig,\n })\n\n return parseHTML(res.data, { url: u.href })\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\nexport { getTurnDownService } from './turndownCode'\n","export const errObj: {\n [key: number]: string\n} = {\n '400': '内容解析失败',\n}\n","export interface TurnDownResult {\n success: boolean\n code: number\n data?: {\n title?: string\n author?: string\n content?: string\n }\n msg?: string\n}\n\nexport const enum Status {\n Success = 200,\n Fail = 400,\n}\n","/**\n * html 转换 markdown 格式\n */\nimport turnDownService from 'turndown'\nimport TurndownPluginGfm from '@guyplusplus/turndown-plugin-gfm'\nimport { formatCode, figure2markdown } from './formatHtml'\n\ninterface Params {\n url: string\n}\n\nfunction getTurnDownService(params: Params) {\n const turndownService = new turnDownService({\n codeBlockStyle: 'fenced',\n hr: '',\n })\n\n TurndownPluginGfm.gfm(turndownService)\n\n let videoCounter = 0\n\n // 自定义配置\n turndownService\n .addRule('pre2Code', {\n filter: ['pre'],\n replacement(content, node: any) {\n const len = content.length\n // 微信文章获取到的 content, 会出现首尾都有 '`'\n const isCode = content[0] === '`' && content[len - 1] === '`'\n\n let pre_Markdown = ''\n\n if (isCode) {\n pre_Markdown = formatCode(node.innerHTML)\n }\n\n const res = isCode ? pre_Markdown : content\n\n return '```\\n' + res + '\\n```\\n'\n },\n })\n .addRule('getImage', {\n filter: ['img'],\n replacement(content, node: any) {\n const src = node.getAttribute('data-src') || ''\n\n return src ? `\\n\\n \\n\\n` : ''\n },\n })\n .addRule('video', {\n filter: (node: HTMLElement) => {\n return (\n node.tagName.toLowerCase() === 'iframe' &&\n node.className.includes('video_iframe')\n )\n },\n replacement(content, _node: Node) {\n const node = _node as HTMLIFrameElement\n\n const cover = decodeURIComponent(\n node.getAttribute('data-cover') || ''\n )\n\n const u = new URL(params.url)\n u.hash = `js_mp_video_container_${videoCounter++}`\n\n return cover ? `\\n\\n[](${u.href}) \\n\\n` : ''\n },\n })\n .addRule('lineBreaks', {\n filter: 'br',\n replacement: () => '\\n',\n })\n .addRule('img2Code', {\n filter: ['figure'],\n replacement(content, node: any) {\n const res = figure2markdown(node.innerHTML)\n return res || ''\n },\n })\n\n return turndownService\n}\n\nexport { getTurnDownService }\n","import cheerio from 'cheerio'\n\n/**\n * 微信不同代码风格\n * 1. <code><span>code</span></code>\n * 2. <code><span><span>123</span><br></span></code>\n * turndown 不解析 code 下的 br 标签,需要使用正则替换 br 标签为 \\n 才可以继续解析\n * @param htmlStr\n * @returns\n */\n\nexport function formatCode(htmlStr: string) {\n let code = htmlStr\n\n code = code.replace(/<br>/gi, '\\n')\n\n code = code.replace(/ /gi, ' ')\n\n code = code.replace(/</gi, '<')\n\n code = code.replace(/>/gi, '>')\n\n code = code.replace(/&/gi, '&')\n\n code = code.replace(/"/gi, '\"')\n\n code = code.replace(/'/gi, '‘')\n\n code = code.replace(/×/gi, '*')\n\n code = code.replace(/÷/gi, '%')\n\n const $ = cheerio.load(code)\n\n return $.text()\n}\n\n/**\n * 解决如下格式\n * <figcaption><img><figcaption></figcaption></figcaption>\n * @param figureHTML\n * @returns\n */\nexport function figure2markdown(figureHTML: string) {\n const imgRegex = /<img.*?data-src=['\"](.*?)['\"]/\n\n const descRegex = /\\<figcaption .*?>(.+)<\\/figcaption>/\n\n const imgArr = figureHTML.match(imgRegex)\n\n const descArr = figureHTML.match(descRegex)\n\n let imgUrl = ''\n\n let desc = ''\n\n if (Array.isArray(imgArr)) {\n imgUrl = imgArr[1]\n }\n\n if (Array.isArray(descArr)) {\n desc = descArr[1]\n }\n\n // img 可能没有图片说明\n if (imgUrl) {\n return `\\n\\n  \\n\\n`\n }\n\n return\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,mBAA0C;AAC1C,IAAAA,kBAAqB;;;ACDd,IAAM,SAET;AAAA,EACA,OAAO;AACX;;;ACOO,IAAW,SAAX,kBAAWC,YAAX;AACH,EAAAA,gBAAA,aAAU,OAAV;AACA,EAAAA,gBAAA,UAAO,OAAP;AAFc,SAAAA;AAAA,GAAA;;;ACRlB,sBAA4B;AAC5B,iCAA8B;;;ACJ9B,qBAAoB;AAWb,SAAS,WAAW,SAAiB;AACxC,MAAI,OAAO;AAEX,SAAO,KAAK,QAAQ,UAAU,IAAI;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,WAAW,GAAG;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,YAAY,QAAG;AAEnC,SAAO,KAAK,QAAQ,aAAa,GAAG;AAEpC,SAAO,KAAK,QAAQ,cAAc,GAAG;AAErC,QAAM,IAAI,eAAAC,QAAQ,KAAK,IAAI;AAE3B,SAAO,EAAE,KAAK;AAClB;AAQO,SAAS,gBAAgB,YAAoB;AAChD,QAAM,WAAW;AAEjB,QAAM,YAAY;AAElB,QAAM,SAAS,WAAW,MAAM,QAAQ;AAExC,QAAM,UAAU,WAAW,MAAM,SAAS;AAE1C,MAAI,SAAS;AAEb,MAAI,OAAO;AAEX,MAAI,MAAM,QAAQ,MAAM,GAAG;AACvB,aAAS,OAAO;AAAA,EACpB;AAEA,MAAI,MAAM,QAAQ,OAAO,GAAG;AACxB,WAAO,QAAQ;AAAA,EACnB;AAGA,MAAI,QAAQ;AACR,WAAO;AAAA;AAAA,KAAU,SAAS;AAAA;AAAA;AAAA,EAC9B;AAEA;AACJ;;;AD3DA,SAAS,mBAAmB,QAAgB;AACxC,QAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,IACxC,gBAAgB;AAAA,IAChB,IAAI;AAAA,EACR,CAAC;AAED,6BAAAC,QAAkB,IAAI,eAAe;AAErC,MAAI,eAAe;AAGnB,kBACK,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,QAAQ;AAEpB,YAAM,SAAS,QAAQ,OAAO,OAAO,QAAQ,MAAM,OAAO;AAE1D,UAAI,eAAe;AAEnB,UAAI,QAAQ;AACR,uBAAe,WAAW,KAAK,SAAS;AAAA,MAC5C;AAEA,YAAM,MAAM,SAAS,eAAe;AAEpC,aAAO,UAAU,MAAM;AAAA,IAC3B;AAAA,EACJ,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,KAAK,aAAa,UAAU,KAAK;AAE7C,aAAO,MAAM;AAAA;AAAA,MAAW;AAAA;AAAA,IAAc;AAAA,IAC1C;AAAA,EACJ,CAAC,EACA,QAAQ,SAAS;AAAA,IACd,QAAQ,CAAC,SAAsB;AAC3B,aACI,KAAK,QAAQ,YAAY,MAAM,YAC/B,KAAK,UAAU,SAAS,cAAc;AAAA,IAE9C;AAAA,IACA,YAAY,SAAS,OAAa;AAC9B,YAAM,OAAO;AAEb,YAAM,QAAQ;AAAA,QACV,KAAK,aAAa,YAAY,KAAK;AAAA,MACvC;AAEA,YAAM,IAAI,IAAI,IAAI,OAAO,GAAG;AAC5B,QAAE,OAAO,yBAAyB;AAElC,aAAO,QAAQ;AAAA;AAAA,OAAY,WAAW,EAAE;AAAA;AAAA,IAAe;AAAA,IAC3D;AAAA,EACJ,CAAC,EACA,QAAQ,cAAc;AAAA,IACnB,QAAQ;AAAA,IACR,aAAa,MAAM;AAAA,EACvB,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,QAAQ;AAAA,IACjB,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,gBAAgB,KAAK,SAAS;AAC1C,aAAO,OAAO;AAAA,IAClB;AAAA,EACJ,CAAC;AAEL,SAAO;AACX;;;AH3EA,IAAM,WAAW,CAAC,SAAiB;AAC/B,SAAO;AAAA,IACH;AAAA,IACA,SAAS;AAAA,IACT,KAAK,OAAO;AAAA,EAChB;AACJ;AAIA,eAAsB,UAAU,SAAiB,MAAuB;AAjBxE;AAkBI,QAAM,QAAI,sBAAK,OAAO;AAEtB,MAAI,QAAQ,EAAE,gBAAgB,EAAE,KAAK;AAErC,UAAQ,MAAM,KAAK,KAAK;AACxB,QAAM,SAAS,MAAM;AAAA,IACjB,IAAI;AAAA,MACA;AAAA,SACI,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK;AAAA,QAC/B,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,IAAI;AAAA,MACtC,EACK,IAAI,CAAC,SAAU,OAAO,KAAK,KAAK,IAAI,EAAG,EACvC,OAAO,OAAO;AAAA,IACvB;AAAA,EACJ,EAAE,KAAK,IAAI;AAEX,QAAM,SAAS,EAAE,aAAa;AAC9B,QAAM,OAAO,OAAO,KAAK;AAEzB,MAAI,QAAQ,KAAK,SAAS,GAAG;AACzB,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO,uBAAoB;AAC/B;AASA,eAAO,uBACH,KACA,UAAyC,CAAC,GACnB;AACvB,QAAM,EAAE,cAAc,CAAC,EAAE,IAAI;AAC7B,QAAM,EAAE,UAAU,CAAC,MAAM,WAAW,IAAI;AAExC,QAAM,IAAI,IAAI,IAAI,GAAG;AAGrB,IAAE,aAAa,OAAO,WAAW;AAEjC,MAAI;AACA,UAAM,MAAM,MAAM,aAAAC,QAAM,IAAI,EAAE,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,cAAc;AAAA,MACd,SAAS;AAAA,QACL,KAAK;AAAA,QACL,6BAA6B;AAAA,QAC7B,cACI;AAAA,QACJ,GAAG;AAAA,MACP;AAAA,MACA,GAAG;AAAA,IACP,CAAC;AAED,WAAO,UAAU,IAAI,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC;AAAA,EAC9C,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;","names":["import_cheerio","Status","cheerio","turnDownService","TurndownPluginGfm","axios"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/error.ts","../src/type.ts","../src/parsers/wechat.ts","../src/turndownCode.ts","../src/formatHtml.ts","../src/parsers/general.ts"],"sourcesContent":["import axios, { AxiosRequestConfig } from 'axios'\nimport { errObj } from './error'\nimport type { TurnDownResult } from './type'\nimport { Status } from './type'\nimport { parseWeChatPage } from './parsers/wechat'\nimport { parseGeneralHTML } from './parsers/general'\n\nconst getError = (code: number) => {\n return {\n code,\n success: false,\n msg: errObj[code],\n }\n}\n\nexport { TurnDownResult, Status }\n\nexport function isWechatPage(html: string) {\n return html?.includes('res.wx.qq.com')\n}\n\nexport async function parseHTML(\n html: string,\n meta: { url: string }\n): Promise<TurnDownResult> {\n let result: TurnDownResult | null = null\n\n if (isWechatPage(html)) {\n result = await parseWeChatPage(html, meta)\n }\n if (!result) {\n // 兜底处理\n result = await parseGeneralHTML(html, meta)\n }\n\n if (result) {\n return result\n }\n\n return getError(Status.Fail)\n}\n\nexport async function transformHtml2Markdown(\n html: string,\n /**\n * 这里的 url 是原始的 url,主要是用来映射内部跳转链接\n */\n url: string\n): Promise<TurnDownResult> {\n try {\n return parseHTML(html, { url })\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\n/**\n * 支持添加代理服务器\n */\ninterface TransformHtml2MarkdownOptions {\n axiosConfig?: AxiosRequestConfig\n}\n\nexport async function transformUrl2Markdown(\n url: string,\n options: TransformHtml2MarkdownOptions = {}\n): Promise<TurnDownResult> {\n const { axiosConfig = {} } = options\n const { headers = {}, ...restConfig } = axiosConfig\n\n const u = new URL(url)\n // 移除该参数\n // 避免出现 302 跳转\n u.searchParams.delete('poc_token')\n\n try {\n const res = await axios.get(u.href, {\n timeout: 30000,\n maxRedirects: 5,\n headers: {\n DNT: '1',\n 'Upgrade-Insecure-Requests': '1',\n 'User-Agent':\n 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',\n ...headers,\n },\n ...restConfig,\n })\n\n return transformHtml2Markdown(res.data, url)\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\nexport { getTurnDownService } from './turndownCode'\n","export const errObj: {\n [key: number]: string\n} = {\n '400': '内容解析失败',\n}\n","export interface TurnDownResult {\n success: boolean\n code: number\n data?: {\n title?: string\n author?: string\n content?: string\n }\n msg?: string\n}\n\nexport const enum Status {\n Success = 200,\n Fail = 400,\n}\n","import { load } from 'cheerio'\nimport { Status } from '../type'\nimport { getTurnDownService } from '../turndownCode'\n\nexport async function parseWeChatPage(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n\n const title = ($('#activity-name').text() || '').trim()\n const author = Array.from(\n new Set(\n [\n $('meta[name=\"author\"]')?.attr('content'),\n ...$('#js_name').text().split('\\n'),\n ]\n .map((item) => (item ? item.trim() : ''))\n .filter(Boolean)\n )\n ).join('\\n')\n\n const htmlEl = $('#js_content')\n const html = htmlEl.html()\n\n if (html?.length) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return null\n}\n","/**\n * html 转换 markdown 格式\n */\nimport turnDownService from 'turndown'\nimport TurndownPluginGfm from '@guyplusplus/turndown-plugin-gfm'\nimport { formatCode, figure2markdown } from './formatHtml'\n\ninterface Params {\n url: string\n}\n\nfunction getTurnDownService(params: Params) {\n const turndownService = new turnDownService({\n codeBlockStyle: 'fenced',\n hr: '',\n })\n\n TurndownPluginGfm.gfm(turndownService)\n\n let videoCounter = 0\n\n // 自定义配置\n turndownService\n .addRule('pre2Code', {\n filter: ['pre'],\n replacement(content, node: any) {\n const len = content.length\n // 微信文章获取到的 content, 会出现首尾都有 '`'\n const isCode = content[0] === '`' && content[len - 1] === '`'\n\n let pre_Markdown = ''\n\n if (isCode) {\n pre_Markdown = formatCode(node.innerHTML)\n }\n\n const res = isCode ? pre_Markdown : content\n\n return '```\\n' + res + '\\n```\\n'\n },\n })\n .addRule('getImage', {\n filter: ['img'],\n replacement(content, node: any) {\n const src = node.getAttribute('data-src') || ''\n\n return src ? `\\n\\n \\n\\n` : ''\n },\n })\n .addRule('video', {\n filter: (node: HTMLElement) => {\n return (\n node.tagName.toLowerCase() === 'iframe' &&\n node.className.includes('video_iframe')\n )\n },\n replacement(content, _node: Node) {\n const node = _node as HTMLIFrameElement\n\n const cover = decodeURIComponent(\n node.getAttribute('data-cover') || ''\n )\n\n const u = new URL(params.url)\n u.hash = `js_mp_video_container_${videoCounter++}`\n\n return cover ? `\\n\\n[](${u.href}) \\n\\n` : ''\n },\n })\n .addRule('lineBreaks', {\n filter: 'br',\n replacement: () => '\\n',\n })\n .addRule('img2Code', {\n filter: ['figure'],\n replacement(content, node: any) {\n const res = figure2markdown(node.innerHTML)\n return res || ''\n },\n })\n\n return turndownService\n}\n\nexport { getTurnDownService }\n","import cheerio from 'cheerio'\n\n/**\n * 微信不同代码风格\n * 1. <code><span>code</span></code>\n * 2. <code><span><span>123</span><br></span></code>\n * turndown 不解析 code 下的 br 标签,需要使用正则替换 br 标签为 \\n 才可以继续解析\n * @param htmlStr\n * @returns\n */\n\nexport function formatCode(htmlStr: string) {\n let code = htmlStr\n\n code = code.replace(/<br>/gi, '\\n')\n\n code = code.replace(/ /gi, ' ')\n\n code = code.replace(/</gi, '<')\n\n code = code.replace(/>/gi, '>')\n\n code = code.replace(/&/gi, '&')\n\n code = code.replace(/"/gi, '\"')\n\n code = code.replace(/'/gi, '‘')\n\n code = code.replace(/×/gi, '*')\n\n code = code.replace(/÷/gi, '%')\n\n const $ = cheerio.load(code)\n\n return $.text()\n}\n\n/**\n * 解决如下格式\n * <figcaption><img><figcaption></figcaption></figcaption>\n * @param figureHTML\n * @returns\n */\nexport function figure2markdown(figureHTML: string) {\n const imgRegex = /<img.*?data-src=['\"](.*?)['\"]/\n\n const descRegex = /\\<figcaption .*?>(.+)<\\/figcaption>/\n\n const imgArr = figureHTML.match(imgRegex)\n\n const descArr = figureHTML.match(descRegex)\n\n let imgUrl = ''\n\n let desc = ''\n\n if (Array.isArray(imgArr)) {\n imgUrl = imgArr[1]\n }\n\n if (Array.isArray(descArr)) {\n desc = descArr[1]\n }\n\n // img 可能没有图片说明\n if (imgUrl) {\n return `\\n\\n  \\n\\n`\n }\n\n return\n}\n","import { load } from 'cheerio'\nimport { Status } from '../type'\nimport { getTurnDownService } from '../turndownCode'\n\nexport async function parseGeneralHTML(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n $('script').remove()\n\n const title = ($('title').text() || '').trim()\n const author = ($('meta[name=\"author\"]')?.attr('content') || '').trim()\n\n const html = $('body').html()\n if (html?.length) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return null\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,mBAA0C;;;ACAnC,IAAM,SAET;AAAA,EACA,OAAO;AACX;;;ACOO,IAAW,SAAX,kBAAWA,YAAX;AACH,EAAAA,gBAAA,aAAU,OAAV;AACA,EAAAA,gBAAA,UAAO,OAAP;AAFc,SAAAA;AAAA,GAAA;;;ACXlB,IAAAC,kBAAqB;;;ACGrB,sBAA4B;AAC5B,iCAA8B;;;ACJ9B,qBAAoB;AAWb,SAAS,WAAW,SAAiB;AACxC,MAAI,OAAO;AAEX,SAAO,KAAK,QAAQ,UAAU,IAAI;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,WAAW,GAAG;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,YAAY,QAAG;AAEnC,SAAO,KAAK,QAAQ,aAAa,GAAG;AAEpC,SAAO,KAAK,QAAQ,cAAc,GAAG;AAErC,QAAM,IAAI,eAAAC,QAAQ,KAAK,IAAI;AAE3B,SAAO,EAAE,KAAK;AAClB;AAQO,SAAS,gBAAgB,YAAoB;AAChD,QAAM,WAAW;AAEjB,QAAM,YAAY;AAElB,QAAM,SAAS,WAAW,MAAM,QAAQ;AAExC,QAAM,UAAU,WAAW,MAAM,SAAS;AAE1C,MAAI,SAAS;AAEb,MAAI,OAAO;AAEX,MAAI,MAAM,QAAQ,MAAM,GAAG;AACvB,aAAS,OAAO;AAAA,EACpB;AAEA,MAAI,MAAM,QAAQ,OAAO,GAAG;AACxB,WAAO,QAAQ;AAAA,EACnB;AAGA,MAAI,QAAQ;AACR,WAAO;AAAA;AAAA,KAAU,SAAS;AAAA;AAAA;AAAA,EAC9B;AAEA;AACJ;;;AD3DA,SAAS,mBAAmB,QAAgB;AACxC,QAAM,kBAAkB,IAAI,gBAAAC,QAAgB;AAAA,IACxC,gBAAgB;AAAA,IAChB,IAAI;AAAA,EACR,CAAC;AAED,6BAAAC,QAAkB,IAAI,eAAe;AAErC,MAAI,eAAe;AAGnB,kBACK,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,QAAQ;AAEpB,YAAM,SAAS,QAAQ,OAAO,OAAO,QAAQ,MAAM,OAAO;AAE1D,UAAI,eAAe;AAEnB,UAAI,QAAQ;AACR,uBAAe,WAAW,KAAK,SAAS;AAAA,MAC5C;AAEA,YAAM,MAAM,SAAS,eAAe;AAEpC,aAAO,UAAU,MAAM;AAAA,IAC3B;AAAA,EACJ,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,KAAK,aAAa,UAAU,KAAK;AAE7C,aAAO,MAAM;AAAA;AAAA,MAAW;AAAA;AAAA,IAAc;AAAA,IAC1C;AAAA,EACJ,CAAC,EACA,QAAQ,SAAS;AAAA,IACd,QAAQ,CAAC,SAAsB;AAC3B,aACI,KAAK,QAAQ,YAAY,MAAM,YAC/B,KAAK,UAAU,SAAS,cAAc;AAAA,IAE9C;AAAA,IACA,YAAY,SAAS,OAAa;AAC9B,YAAM,OAAO;AAEb,YAAM,QAAQ;AAAA,QACV,KAAK,aAAa,YAAY,KAAK;AAAA,MACvC;AAEA,YAAM,IAAI,IAAI,IAAI,OAAO,GAAG;AAC5B,QAAE,OAAO,yBAAyB;AAElC,aAAO,QAAQ;AAAA;AAAA,OAAY,WAAW,EAAE;AAAA;AAAA,IAAe;AAAA,IAC3D;AAAA,EACJ,CAAC,EACA,QAAQ,cAAc;AAAA,IACnB,QAAQ;AAAA,IACR,aAAa,MAAM;AAAA,EACvB,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,QAAQ;AAAA,IACjB,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,gBAAgB,KAAK,SAAS;AAC1C,aAAO,OAAO;AAAA,IAClB;AAAA,EACJ,CAAC;AAEL,SAAO;AACX;;;AD9EA,eAAsB,gBAAgB,SAAiB,MAAuB;AAJ9E;AAKI,QAAM,QAAI,sBAAK,OAAO;AAEtB,QAAM,SAAS,EAAE,gBAAgB,EAAE,KAAK,KAAK,IAAI,KAAK;AACtD,QAAM,SAAS,MAAM;AAAA,IACjB,IAAI;AAAA,MACA;AAAA,SACI,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK;AAAA,QAC/B,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,IAAI;AAAA,MACtC,EACK,IAAI,CAAC,SAAU,OAAO,KAAK,KAAK,IAAI,EAAG,EACvC,OAAO,OAAO;AAAA,IACvB;AAAA,EACJ,EAAE,KAAK,IAAI;AAEX,QAAM,SAAS,EAAE,aAAa;AAC9B,QAAM,OAAO,OAAO,KAAK;AAEzB,MAAI,6BAAM,QAAQ;AACd,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO;AACX;;;AGvCA,IAAAC,kBAAqB;AAIrB,eAAsB,iBAAiB,SAAiB,MAAuB;AAJ/E;AAKI,QAAM,QAAI,sBAAK,OAAO;AACtB,IAAE,QAAQ,EAAE,OAAO;AAEnB,QAAM,SAAS,EAAE,OAAO,EAAE,KAAK,KAAK,IAAI,KAAK;AAC7C,QAAM,YAAU,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK,eAAc,IAAI,KAAK;AAEtE,QAAM,OAAO,EAAE,MAAM,EAAE,KAAK;AAC5B,MAAI,6BAAM,QAAQ;AACd,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO;AACX;;;ANtBA,IAAM,WAAW,CAAC,SAAiB;AAC/B,SAAO;AAAA,IACH;AAAA,IACA,SAAS;AAAA,IACT,KAAK,OAAO;AAAA,EAChB;AACJ;AAIO,SAAS,aAAa,MAAc;AACvC,SAAO,6BAAM,SAAS;AAC1B;AAEA,eAAsB,UAClB,MACA,MACuB;AACvB,MAAI,SAAgC;AAEpC,MAAI,aAAa,IAAI,GAAG;AACpB,aAAS,MAAM,gBAAgB,MAAM,IAAI;AAAA,EAC7C;AACA,MAAI,CAAC,QAAQ;AAET,aAAS,MAAM,iBAAiB,MAAM,IAAI;AAAA,EAC9C;AAEA,MAAI,QAAQ;AACR,WAAO;AAAA,EACX;AAEA,SAAO,uBAAoB;AAC/B;AAEA,eAAsB,uBAClB,MAIA,KACuB;AACvB,MAAI;AACA,WAAO,UAAU,MAAM,EAAE,IAAI,CAAC;AAAA,EAClC,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;AASA,eAAsB,sBAClB,KACA,UAAyC,CAAC,GACnB;AACvB,QAAM,EAAE,cAAc,CAAC,EAAE,IAAI;AAC7B,QAAM,EAAE,UAAU,CAAC,MAAM,WAAW,IAAI;AAExC,QAAM,IAAI,IAAI,IAAI,GAAG;AAGrB,IAAE,aAAa,OAAO,WAAW;AAEjC,MAAI;AACA,UAAM,MAAM,MAAM,aAAAC,QAAM,IAAI,EAAE,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,cAAc;AAAA,MACd,SAAS;AAAA,QACL,KAAK;AAAA,QACL,6BAA6B;AAAA,QAC7B,cACI;AAAA,QACJ,GAAG;AAAA,MACP;AAAA,MACA,GAAG;AAAA,IACP,CAAC;AAED,WAAO,uBAAuB,IAAI,MAAM,GAAG;AAAA,EAC/C,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;","names":["Status","import_cheerio","cheerio","turnDownService","TurndownPluginGfm","import_cheerio","axios"]}
|
package/dist/index.d.ts
CHANGED
|
@@ -25,27 +25,21 @@ interface Params {
|
|
|
25
25
|
}
|
|
26
26
|
declare function getTurnDownService(params: Params): turnDownService;
|
|
27
27
|
|
|
28
|
-
declare function
|
|
28
|
+
declare function isWechatPage(html: string): boolean;
|
|
29
|
+
declare function parseHTML(html: string, meta: {
|
|
29
30
|
url: string;
|
|
30
|
-
}): Promise<
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
code: Status;
|
|
37
|
-
data: {
|
|
38
|
-
title: string;
|
|
39
|
-
author: string;
|
|
40
|
-
content: string;
|
|
41
|
-
};
|
|
42
|
-
}>;
|
|
31
|
+
}): Promise<TurnDownResult>;
|
|
32
|
+
declare function transformHtml2Markdown(html: string,
|
|
33
|
+
/**
|
|
34
|
+
* 这里的 url 是原始的 url,主要是用来映射内部跳转链接
|
|
35
|
+
*/
|
|
36
|
+
url: string): Promise<TurnDownResult>;
|
|
43
37
|
/**
|
|
44
38
|
* 支持添加代理服务器
|
|
45
39
|
*/
|
|
46
40
|
interface TransformHtml2MarkdownOptions {
|
|
47
41
|
axiosConfig?: AxiosRequestConfig;
|
|
48
42
|
}
|
|
49
|
-
declare function
|
|
43
|
+
declare function transformUrl2Markdown(url: string, options?: TransformHtml2MarkdownOptions): Promise<TurnDownResult>;
|
|
50
44
|
|
|
51
|
-
export { Status, TurnDownResult,
|
|
45
|
+
export { Status, TurnDownResult, getTurnDownService, isWechatPage, parseHTML, transformHtml2Markdown, transformUrl2Markdown };
|
package/dist/index.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
// src/index.ts
|
|
2
2
|
import axios from "axios";
|
|
3
|
-
import { load } from "cheerio";
|
|
4
3
|
|
|
5
4
|
// src/error.ts
|
|
6
5
|
var errObj = {
|
|
@@ -14,6 +13,9 @@ var Status = /* @__PURE__ */ ((Status2) => {
|
|
|
14
13
|
return Status2;
|
|
15
14
|
})(Status || {});
|
|
16
15
|
|
|
16
|
+
// src/parsers/wechat.ts
|
|
17
|
+
import { load } from "cheerio";
|
|
18
|
+
|
|
17
19
|
// src/turndownCode.ts
|
|
18
20
|
import turnDownService from "turndown";
|
|
19
21
|
import TurndownPluginGfm from "@guyplusplus/turndown-plugin-gfm";
|
|
@@ -117,19 +119,11 @@ function getTurnDownService(params) {
|
|
|
117
119
|
return turndownService;
|
|
118
120
|
}
|
|
119
121
|
|
|
120
|
-
// src/
|
|
121
|
-
|
|
122
|
-
return {
|
|
123
|
-
code,
|
|
124
|
-
success: false,
|
|
125
|
-
msg: errObj[code]
|
|
126
|
-
};
|
|
127
|
-
};
|
|
128
|
-
async function parseHTML(htmlRaw, meta) {
|
|
122
|
+
// src/parsers/wechat.ts
|
|
123
|
+
async function parseWeChatPage(htmlRaw, meta) {
|
|
129
124
|
var _a;
|
|
130
125
|
const $ = load(htmlRaw);
|
|
131
|
-
|
|
132
|
-
title = title.trim() || "";
|
|
126
|
+
const title = ($("#activity-name").text() || "").trim();
|
|
133
127
|
const author = Array.from(
|
|
134
128
|
new Set(
|
|
135
129
|
[
|
|
@@ -140,7 +134,36 @@ async function parseHTML(htmlRaw, meta) {
|
|
|
140
134
|
).join("\n");
|
|
141
135
|
const htmlEl = $("#js_content");
|
|
142
136
|
const html = htmlEl.html();
|
|
143
|
-
if (html
|
|
137
|
+
if (html == null ? void 0 : html.length) {
|
|
138
|
+
let res = getTurnDownService(meta).turndown(html);
|
|
139
|
+
res = `## ${title}
|
|
140
|
+
|
|
141
|
+
## \u4F5C\u8005 ${author}
|
|
142
|
+
|
|
143
|
+
` + res;
|
|
144
|
+
return {
|
|
145
|
+
success: true,
|
|
146
|
+
code: 200 /* Success */,
|
|
147
|
+
data: {
|
|
148
|
+
title,
|
|
149
|
+
author,
|
|
150
|
+
content: res
|
|
151
|
+
}
|
|
152
|
+
};
|
|
153
|
+
}
|
|
154
|
+
return null;
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// src/parsers/general.ts
|
|
158
|
+
import { load as load2 } from "cheerio";
|
|
159
|
+
async function parseGeneralHTML(htmlRaw, meta) {
|
|
160
|
+
var _a;
|
|
161
|
+
const $ = load2(htmlRaw);
|
|
162
|
+
$("script").remove();
|
|
163
|
+
const title = ($("title").text() || "").trim();
|
|
164
|
+
const author = (((_a = $('meta[name="author"]')) == null ? void 0 : _a.attr("content")) || "").trim();
|
|
165
|
+
const html = $("body").html();
|
|
166
|
+
if (html == null ? void 0 : html.length) {
|
|
144
167
|
let res = getTurnDownService(meta).turndown(html);
|
|
145
168
|
res = `## ${title}
|
|
146
169
|
|
|
@@ -157,9 +180,42 @@ async function parseHTML(htmlRaw, meta) {
|
|
|
157
180
|
}
|
|
158
181
|
};
|
|
159
182
|
}
|
|
183
|
+
return null;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// src/index.ts
|
|
187
|
+
var getError = (code) => {
|
|
188
|
+
return {
|
|
189
|
+
code,
|
|
190
|
+
success: false,
|
|
191
|
+
msg: errObj[code]
|
|
192
|
+
};
|
|
193
|
+
};
|
|
194
|
+
function isWechatPage(html) {
|
|
195
|
+
return html == null ? void 0 : html.includes("res.wx.qq.com");
|
|
196
|
+
}
|
|
197
|
+
async function parseHTML(html, meta) {
|
|
198
|
+
let result = null;
|
|
199
|
+
if (isWechatPage(html)) {
|
|
200
|
+
result = await parseWeChatPage(html, meta);
|
|
201
|
+
}
|
|
202
|
+
if (!result) {
|
|
203
|
+
result = await parseGeneralHTML(html, meta);
|
|
204
|
+
}
|
|
205
|
+
if (result) {
|
|
206
|
+
return result;
|
|
207
|
+
}
|
|
160
208
|
return getError(400 /* Fail */);
|
|
161
209
|
}
|
|
162
|
-
async function transformHtml2Markdown(
|
|
210
|
+
async function transformHtml2Markdown(html, url) {
|
|
211
|
+
try {
|
|
212
|
+
return parseHTML(html, { url });
|
|
213
|
+
} catch (err) {
|
|
214
|
+
console.log(err);
|
|
215
|
+
return getError(400 /* Fail */);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
async function transformUrl2Markdown(url, options = {}) {
|
|
163
219
|
const { axiosConfig = {} } = options;
|
|
164
220
|
const { headers = {}, ...restConfig } = axiosConfig;
|
|
165
221
|
const u = new URL(url);
|
|
@@ -176,7 +232,7 @@ async function transformHtml2Markdown(url, options = {}) {
|
|
|
176
232
|
},
|
|
177
233
|
...restConfig
|
|
178
234
|
});
|
|
179
|
-
return
|
|
235
|
+
return transformHtml2Markdown(res.data, url);
|
|
180
236
|
} catch (err) {
|
|
181
237
|
console.log(err);
|
|
182
238
|
return getError(400 /* Fail */);
|
|
@@ -184,8 +240,10 @@ async function transformHtml2Markdown(url, options = {}) {
|
|
|
184
240
|
}
|
|
185
241
|
export {
|
|
186
242
|
Status,
|
|
187
|
-
transformHtml2Markdown as default,
|
|
188
243
|
getTurnDownService,
|
|
189
|
-
|
|
244
|
+
isWechatPage,
|
|
245
|
+
parseHTML,
|
|
246
|
+
transformHtml2Markdown,
|
|
247
|
+
transformUrl2Markdown
|
|
190
248
|
};
|
|
191
249
|
//# sourceMappingURL=index.js.map
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/index.ts","../src/error.ts","../src/type.ts","../src/turndownCode.ts","../src/formatHtml.ts"],"sourcesContent":["import axios, { AxiosRequestConfig } from 'axios'\nimport { load } from 'cheerio'\nimport { errObj } from './error'\nimport type { TurnDownResult } from './type'\nimport { Status } from './type'\nimport { getTurnDownService } from './turndownCode'\n\nconst getError = (code: number) => {\n return {\n code,\n success: false,\n msg: errObj[code],\n }\n}\n\nexport { TurnDownResult, Status }\n\nexport async function parseHTML(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n\n let title = $('#activity-name').text()\n\n title = title.trim() || ''\n const author = Array.from(\n new Set(\n [\n $('meta[name=\"author\"]')?.attr('content'),\n ...$('#js_name').text().split('\\n'),\n ]\n .map((item) => (item ? item.trim() : ''))\n .filter(Boolean)\n )\n ).join('\\n')\n\n const htmlEl = $('#js_content')\n const html = htmlEl.html()\n\n if (html && html.length > 0) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return getError(Status.Fail)\n}\n\n/**\n * 支持添加代理服务器\n */\ninterface TransformHtml2MarkdownOptions {\n axiosConfig?: AxiosRequestConfig\n}\n\nexport default async function transformHtml2Markdown(\n url: string,\n options: TransformHtml2MarkdownOptions = {}\n): Promise<TurnDownResult> {\n const { axiosConfig = {} } = options\n const { headers = {}, ...restConfig } = axiosConfig\n\n const u = new URL(url)\n // 移除该参数\n // 避免出现 302 跳转\n u.searchParams.delete('poc_token')\n\n try {\n const res = await axios.get(u.href, {\n timeout: 30000,\n maxRedirects: 5,\n headers: {\n DNT: '1',\n 'Upgrade-Insecure-Requests': '1',\n 'User-Agent':\n 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',\n ...headers,\n },\n ...restConfig,\n })\n\n return parseHTML(res.data, { url: u.href })\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\nexport { getTurnDownService } from './turndownCode'\n","export const errObj: {\n [key: number]: string\n} = {\n '400': '内容解析失败',\n}\n","export interface TurnDownResult {\n success: boolean\n code: number\n data?: {\n title?: string\n author?: string\n content?: string\n }\n msg?: string\n}\n\nexport const enum Status {\n Success = 200,\n Fail = 400,\n}\n","/**\n * html 转换 markdown 格式\n */\nimport turnDownService from 'turndown'\nimport TurndownPluginGfm from '@guyplusplus/turndown-plugin-gfm'\nimport { formatCode, figure2markdown } from './formatHtml'\n\ninterface Params {\n url: string\n}\n\nfunction getTurnDownService(params: Params) {\n const turndownService = new turnDownService({\n codeBlockStyle: 'fenced',\n hr: '',\n })\n\n TurndownPluginGfm.gfm(turndownService)\n\n let videoCounter = 0\n\n // 自定义配置\n turndownService\n .addRule('pre2Code', {\n filter: ['pre'],\n replacement(content, node: any) {\n const len = content.length\n // 微信文章获取到的 content, 会出现首尾都有 '`'\n const isCode = content[0] === '`' && content[len - 1] === '`'\n\n let pre_Markdown = ''\n\n if (isCode) {\n pre_Markdown = formatCode(node.innerHTML)\n }\n\n const res = isCode ? pre_Markdown : content\n\n return '```\\n' + res + '\\n```\\n'\n },\n })\n .addRule('getImage', {\n filter: ['img'],\n replacement(content, node: any) {\n const src = node.getAttribute('data-src') || ''\n\n return src ? `\\n\\n \\n\\n` : ''\n },\n })\n .addRule('video', {\n filter: (node: HTMLElement) => {\n return (\n node.tagName.toLowerCase() === 'iframe' &&\n node.className.includes('video_iframe')\n )\n },\n replacement(content, _node: Node) {\n const node = _node as HTMLIFrameElement\n\n const cover = decodeURIComponent(\n node.getAttribute('data-cover') || ''\n )\n\n const u = new URL(params.url)\n u.hash = `js_mp_video_container_${videoCounter++}`\n\n return cover ? `\\n\\n[](${u.href}) \\n\\n` : ''\n },\n })\n .addRule('lineBreaks', {\n filter: 'br',\n replacement: () => '\\n',\n })\n .addRule('img2Code', {\n filter: ['figure'],\n replacement(content, node: any) {\n const res = figure2markdown(node.innerHTML)\n return res || ''\n },\n })\n\n return turndownService\n}\n\nexport { getTurnDownService }\n","import cheerio from 'cheerio'\n\n/**\n * 微信不同代码风格\n * 1. <code><span>code</span></code>\n * 2. <code><span><span>123</span><br></span></code>\n * turndown 不解析 code 下的 br 标签,需要使用正则替换 br 标签为 \\n 才可以继续解析\n * @param htmlStr\n * @returns\n */\n\nexport function formatCode(htmlStr: string) {\n let code = htmlStr\n\n code = code.replace(/<br>/gi, '\\n')\n\n code = code.replace(/ /gi, ' ')\n\n code = code.replace(/</gi, '<')\n\n code = code.replace(/>/gi, '>')\n\n code = code.replace(/&/gi, '&')\n\n code = code.replace(/"/gi, '\"')\n\n code = code.replace(/'/gi, '‘')\n\n code = code.replace(/×/gi, '*')\n\n code = code.replace(/÷/gi, '%')\n\n const $ = cheerio.load(code)\n\n return $.text()\n}\n\n/**\n * 解决如下格式\n * <figcaption><img><figcaption></figcaption></figcaption>\n * @param figureHTML\n * @returns\n */\nexport function figure2markdown(figureHTML: string) {\n const imgRegex = /<img.*?data-src=['\"](.*?)['\"]/\n\n const descRegex = /\\<figcaption .*?>(.+)<\\/figcaption>/\n\n const imgArr = figureHTML.match(imgRegex)\n\n const descArr = figureHTML.match(descRegex)\n\n let imgUrl = ''\n\n let desc = ''\n\n if (Array.isArray(imgArr)) {\n imgUrl = imgArr[1]\n }\n\n if (Array.isArray(descArr)) {\n desc = descArr[1]\n }\n\n // img 可能没有图片说明\n if (imgUrl) {\n return `\\n\\n  \\n\\n`\n }\n\n return\n}\n"],"mappings":";AAAA,OAAO,WAAmC;AAC1C,SAAS,YAAY;;;ACDd,IAAM,SAET;AAAA,EACA,OAAO;AACX;;;ACOO,IAAW,SAAX,kBAAWA,YAAX;AACH,EAAAA,gBAAA,aAAU,OAAV;AACA,EAAAA,gBAAA,UAAO,OAAP;AAFc,SAAAA;AAAA,GAAA;;;ACRlB,OAAO,qBAAqB;AAC5B,OAAO,uBAAuB;;;ACJ9B,OAAO,aAAa;AAWb,SAAS,WAAW,SAAiB;AACxC,MAAI,OAAO;AAEX,SAAO,KAAK,QAAQ,UAAU,IAAI;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,WAAW,GAAG;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,YAAY,QAAG;AAEnC,SAAO,KAAK,QAAQ,aAAa,GAAG;AAEpC,SAAO,KAAK,QAAQ,cAAc,GAAG;AAErC,QAAM,IAAI,QAAQ,KAAK,IAAI;AAE3B,SAAO,EAAE,KAAK;AAClB;AAQO,SAAS,gBAAgB,YAAoB;AAChD,QAAM,WAAW;AAEjB,QAAM,YAAY;AAElB,QAAM,SAAS,WAAW,MAAM,QAAQ;AAExC,QAAM,UAAU,WAAW,MAAM,SAAS;AAE1C,MAAI,SAAS;AAEb,MAAI,OAAO;AAEX,MAAI,MAAM,QAAQ,MAAM,GAAG;AACvB,aAAS,OAAO;AAAA,EACpB;AAEA,MAAI,MAAM,QAAQ,OAAO,GAAG;AACxB,WAAO,QAAQ;AAAA,EACnB;AAGA,MAAI,QAAQ;AACR,WAAO;AAAA;AAAA,KAAU,SAAS;AAAA;AAAA;AAAA,EAC9B;AAEA;AACJ;;;AD3DA,SAAS,mBAAmB,QAAgB;AACxC,QAAM,kBAAkB,IAAI,gBAAgB;AAAA,IACxC,gBAAgB;AAAA,IAChB,IAAI;AAAA,EACR,CAAC;AAED,oBAAkB,IAAI,eAAe;AAErC,MAAI,eAAe;AAGnB,kBACK,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,QAAQ;AAEpB,YAAM,SAAS,QAAQ,OAAO,OAAO,QAAQ,MAAM,OAAO;AAE1D,UAAI,eAAe;AAEnB,UAAI,QAAQ;AACR,uBAAe,WAAW,KAAK,SAAS;AAAA,MAC5C;AAEA,YAAM,MAAM,SAAS,eAAe;AAEpC,aAAO,UAAU,MAAM;AAAA,IAC3B;AAAA,EACJ,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,KAAK,aAAa,UAAU,KAAK;AAE7C,aAAO,MAAM;AAAA;AAAA,MAAW;AAAA;AAAA,IAAc;AAAA,IAC1C;AAAA,EACJ,CAAC,EACA,QAAQ,SAAS;AAAA,IACd,QAAQ,CAAC,SAAsB;AAC3B,aACI,KAAK,QAAQ,YAAY,MAAM,YAC/B,KAAK,UAAU,SAAS,cAAc;AAAA,IAE9C;AAAA,IACA,YAAY,SAAS,OAAa;AAC9B,YAAM,OAAO;AAEb,YAAM,QAAQ;AAAA,QACV,KAAK,aAAa,YAAY,KAAK;AAAA,MACvC;AAEA,YAAM,IAAI,IAAI,IAAI,OAAO,GAAG;AAC5B,QAAE,OAAO,yBAAyB;AAElC,aAAO,QAAQ;AAAA;AAAA,OAAY,WAAW,EAAE;AAAA;AAAA,IAAe;AAAA,IAC3D;AAAA,EACJ,CAAC,EACA,QAAQ,cAAc;AAAA,IACnB,QAAQ;AAAA,IACR,aAAa,MAAM;AAAA,EACvB,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,QAAQ;AAAA,IACjB,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,gBAAgB,KAAK,SAAS;AAC1C,aAAO,OAAO;AAAA,IAClB;AAAA,EACJ,CAAC;AAEL,SAAO;AACX;;;AH3EA,IAAM,WAAW,CAAC,SAAiB;AAC/B,SAAO;AAAA,IACH;AAAA,IACA,SAAS;AAAA,IACT,KAAK,OAAO;AAAA,EAChB;AACJ;AAIA,eAAsB,UAAU,SAAiB,MAAuB;AAjBxE;AAkBI,QAAM,IAAI,KAAK,OAAO;AAEtB,MAAI,QAAQ,EAAE,gBAAgB,EAAE,KAAK;AAErC,UAAQ,MAAM,KAAK,KAAK;AACxB,QAAM,SAAS,MAAM;AAAA,IACjB,IAAI;AAAA,MACA;AAAA,SACI,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK;AAAA,QAC/B,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,IAAI;AAAA,MACtC,EACK,IAAI,CAAC,SAAU,OAAO,KAAK,KAAK,IAAI,EAAG,EACvC,OAAO,OAAO;AAAA,IACvB;AAAA,EACJ,EAAE,KAAK,IAAI;AAEX,QAAM,SAAS,EAAE,aAAa;AAC9B,QAAM,OAAO,OAAO,KAAK;AAEzB,MAAI,QAAQ,KAAK,SAAS,GAAG;AACzB,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO,uBAAoB;AAC/B;AASA,eAAO,uBACH,KACA,UAAyC,CAAC,GACnB;AACvB,QAAM,EAAE,cAAc,CAAC,EAAE,IAAI;AAC7B,QAAM,EAAE,UAAU,CAAC,MAAM,WAAW,IAAI;AAExC,QAAM,IAAI,IAAI,IAAI,GAAG;AAGrB,IAAE,aAAa,OAAO,WAAW;AAEjC,MAAI;AACA,UAAM,MAAM,MAAM,MAAM,IAAI,EAAE,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,cAAc;AAAA,MACd,SAAS;AAAA,QACL,KAAK;AAAA,QACL,6BAA6B;AAAA,QAC7B,cACI;AAAA,QACJ,GAAG;AAAA,MACP;AAAA,MACA,GAAG;AAAA,IACP,CAAC;AAED,WAAO,UAAU,IAAI,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC;AAAA,EAC9C,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;","names":["Status"]}
|
|
1
|
+
{"version":3,"sources":["../src/index.ts","../src/error.ts","../src/type.ts","../src/parsers/wechat.ts","../src/turndownCode.ts","../src/formatHtml.ts","../src/parsers/general.ts"],"sourcesContent":["import axios, { AxiosRequestConfig } from 'axios'\nimport { errObj } from './error'\nimport type { TurnDownResult } from './type'\nimport { Status } from './type'\nimport { parseWeChatPage } from './parsers/wechat'\nimport { parseGeneralHTML } from './parsers/general'\n\nconst getError = (code: number) => {\n return {\n code,\n success: false,\n msg: errObj[code],\n }\n}\n\nexport { TurnDownResult, Status }\n\nexport function isWechatPage(html: string) {\n return html?.includes('res.wx.qq.com')\n}\n\nexport async function parseHTML(\n html: string,\n meta: { url: string }\n): Promise<TurnDownResult> {\n let result: TurnDownResult | null = null\n\n if (isWechatPage(html)) {\n result = await parseWeChatPage(html, meta)\n }\n if (!result) {\n // 兜底处理\n result = await parseGeneralHTML(html, meta)\n }\n\n if (result) {\n return result\n }\n\n return getError(Status.Fail)\n}\n\nexport async function transformHtml2Markdown(\n html: string,\n /**\n * 这里的 url 是原始的 url,主要是用来映射内部跳转链接\n */\n url: string\n): Promise<TurnDownResult> {\n try {\n return parseHTML(html, { url })\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\n/**\n * 支持添加代理服务器\n */\ninterface TransformHtml2MarkdownOptions {\n axiosConfig?: AxiosRequestConfig\n}\n\nexport async function transformUrl2Markdown(\n url: string,\n options: TransformHtml2MarkdownOptions = {}\n): Promise<TurnDownResult> {\n const { axiosConfig = {} } = options\n const { headers = {}, ...restConfig } = axiosConfig\n\n const u = new URL(url)\n // 移除该参数\n // 避免出现 302 跳转\n u.searchParams.delete('poc_token')\n\n try {\n const res = await axios.get(u.href, {\n timeout: 30000,\n maxRedirects: 5,\n headers: {\n DNT: '1',\n 'Upgrade-Insecure-Requests': '1',\n 'User-Agent':\n 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',\n ...headers,\n },\n ...restConfig,\n })\n\n return transformHtml2Markdown(res.data, url)\n } catch (err) {\n console.log(err)\n return getError(Status.Fail)\n }\n}\n\nexport { getTurnDownService } from './turndownCode'\n","export const errObj: {\n [key: number]: string\n} = {\n '400': '内容解析失败',\n}\n","export interface TurnDownResult {\n success: boolean\n code: number\n data?: {\n title?: string\n author?: string\n content?: string\n }\n msg?: string\n}\n\nexport const enum Status {\n Success = 200,\n Fail = 400,\n}\n","import { load } from 'cheerio'\nimport { Status } from '../type'\nimport { getTurnDownService } from '../turndownCode'\n\nexport async function parseWeChatPage(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n\n const title = ($('#activity-name').text() || '').trim()\n const author = Array.from(\n new Set(\n [\n $('meta[name=\"author\"]')?.attr('content'),\n ...$('#js_name').text().split('\\n'),\n ]\n .map((item) => (item ? item.trim() : ''))\n .filter(Boolean)\n )\n ).join('\\n')\n\n const htmlEl = $('#js_content')\n const html = htmlEl.html()\n\n if (html?.length) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return null\n}\n","/**\n * html 转换 markdown 格式\n */\nimport turnDownService from 'turndown'\nimport TurndownPluginGfm from '@guyplusplus/turndown-plugin-gfm'\nimport { formatCode, figure2markdown } from './formatHtml'\n\ninterface Params {\n url: string\n}\n\nfunction getTurnDownService(params: Params) {\n const turndownService = new turnDownService({\n codeBlockStyle: 'fenced',\n hr: '',\n })\n\n TurndownPluginGfm.gfm(turndownService)\n\n let videoCounter = 0\n\n // 自定义配置\n turndownService\n .addRule('pre2Code', {\n filter: ['pre'],\n replacement(content, node: any) {\n const len = content.length\n // 微信文章获取到的 content, 会出现首尾都有 '`'\n const isCode = content[0] === '`' && content[len - 1] === '`'\n\n let pre_Markdown = ''\n\n if (isCode) {\n pre_Markdown = formatCode(node.innerHTML)\n }\n\n const res = isCode ? pre_Markdown : content\n\n return '```\\n' + res + '\\n```\\n'\n },\n })\n .addRule('getImage', {\n filter: ['img'],\n replacement(content, node: any) {\n const src = node.getAttribute('data-src') || ''\n\n return src ? `\\n\\n \\n\\n` : ''\n },\n })\n .addRule('video', {\n filter: (node: HTMLElement) => {\n return (\n node.tagName.toLowerCase() === 'iframe' &&\n node.className.includes('video_iframe')\n )\n },\n replacement(content, _node: Node) {\n const node = _node as HTMLIFrameElement\n\n const cover = decodeURIComponent(\n node.getAttribute('data-cover') || ''\n )\n\n const u = new URL(params.url)\n u.hash = `js_mp_video_container_${videoCounter++}`\n\n return cover ? `\\n\\n[](${u.href}) \\n\\n` : ''\n },\n })\n .addRule('lineBreaks', {\n filter: 'br',\n replacement: () => '\\n',\n })\n .addRule('img2Code', {\n filter: ['figure'],\n replacement(content, node: any) {\n const res = figure2markdown(node.innerHTML)\n return res || ''\n },\n })\n\n return turndownService\n}\n\nexport { getTurnDownService }\n","import cheerio from 'cheerio'\n\n/**\n * 微信不同代码风格\n * 1. <code><span>code</span></code>\n * 2. <code><span><span>123</span><br></span></code>\n * turndown 不解析 code 下的 br 标签,需要使用正则替换 br 标签为 \\n 才可以继续解析\n * @param htmlStr\n * @returns\n */\n\nexport function formatCode(htmlStr: string) {\n let code = htmlStr\n\n code = code.replace(/<br>/gi, '\\n')\n\n code = code.replace(/ /gi, ' ')\n\n code = code.replace(/</gi, '<')\n\n code = code.replace(/>/gi, '>')\n\n code = code.replace(/&/gi, '&')\n\n code = code.replace(/"/gi, '\"')\n\n code = code.replace(/'/gi, '‘')\n\n code = code.replace(/×/gi, '*')\n\n code = code.replace(/÷/gi, '%')\n\n const $ = cheerio.load(code)\n\n return $.text()\n}\n\n/**\n * 解决如下格式\n * <figcaption><img><figcaption></figcaption></figcaption>\n * @param figureHTML\n * @returns\n */\nexport function figure2markdown(figureHTML: string) {\n const imgRegex = /<img.*?data-src=['\"](.*?)['\"]/\n\n const descRegex = /\\<figcaption .*?>(.+)<\\/figcaption>/\n\n const imgArr = figureHTML.match(imgRegex)\n\n const descArr = figureHTML.match(descRegex)\n\n let imgUrl = ''\n\n let desc = ''\n\n if (Array.isArray(imgArr)) {\n imgUrl = imgArr[1]\n }\n\n if (Array.isArray(descArr)) {\n desc = descArr[1]\n }\n\n // img 可能没有图片说明\n if (imgUrl) {\n return `\\n\\n  \\n\\n`\n }\n\n return\n}\n","import { load } from 'cheerio'\nimport { Status } from '../type'\nimport { getTurnDownService } from '../turndownCode'\n\nexport async function parseGeneralHTML(htmlRaw: string, meta: { url: string }) {\n const $ = load(htmlRaw)\n $('script').remove()\n\n const title = ($('title').text() || '').trim()\n const author = ($('meta[name=\"author\"]')?.attr('content') || '').trim()\n\n const html = $('body').html()\n if (html?.length) {\n let res = getTurnDownService(meta).turndown(html)\n\n res = `## ${title} \\n \\n` + `## 作者 ${author} \\n \\n` + res\n\n return {\n success: true,\n code: Status.Success,\n data: {\n title,\n author,\n content: res,\n },\n }\n }\n\n return null\n}\n"],"mappings":";AAAA,OAAO,WAAmC;;;ACAnC,IAAM,SAET;AAAA,EACA,OAAO;AACX;;;ACOO,IAAW,SAAX,kBAAWA,YAAX;AACH,EAAAA,gBAAA,aAAU,OAAV;AACA,EAAAA,gBAAA,UAAO,OAAP;AAFc,SAAAA;AAAA,GAAA;;;ACXlB,SAAS,YAAY;;;ACGrB,OAAO,qBAAqB;AAC5B,OAAO,uBAAuB;;;ACJ9B,OAAO,aAAa;AAWb,SAAS,WAAW,SAAiB;AACxC,MAAI,OAAO;AAEX,SAAO,KAAK,QAAQ,UAAU,IAAI;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,UAAU,GAAG;AAEjC,SAAO,KAAK,QAAQ,WAAW,GAAG;AAElC,SAAO,KAAK,QAAQ,YAAY,GAAG;AAEnC,SAAO,KAAK,QAAQ,YAAY,QAAG;AAEnC,SAAO,KAAK,QAAQ,aAAa,GAAG;AAEpC,SAAO,KAAK,QAAQ,cAAc,GAAG;AAErC,QAAM,IAAI,QAAQ,KAAK,IAAI;AAE3B,SAAO,EAAE,KAAK;AAClB;AAQO,SAAS,gBAAgB,YAAoB;AAChD,QAAM,WAAW;AAEjB,QAAM,YAAY;AAElB,QAAM,SAAS,WAAW,MAAM,QAAQ;AAExC,QAAM,UAAU,WAAW,MAAM,SAAS;AAE1C,MAAI,SAAS;AAEb,MAAI,OAAO;AAEX,MAAI,MAAM,QAAQ,MAAM,GAAG;AACvB,aAAS,OAAO;AAAA,EACpB;AAEA,MAAI,MAAM,QAAQ,OAAO,GAAG;AACxB,WAAO,QAAQ;AAAA,EACnB;AAGA,MAAI,QAAQ;AACR,WAAO;AAAA;AAAA,KAAU,SAAS;AAAA;AAAA;AAAA,EAC9B;AAEA;AACJ;;;AD3DA,SAAS,mBAAmB,QAAgB;AACxC,QAAM,kBAAkB,IAAI,gBAAgB;AAAA,IACxC,gBAAgB;AAAA,IAChB,IAAI;AAAA,EACR,CAAC;AAED,oBAAkB,IAAI,eAAe;AAErC,MAAI,eAAe;AAGnB,kBACK,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,QAAQ;AAEpB,YAAM,SAAS,QAAQ,OAAO,OAAO,QAAQ,MAAM,OAAO;AAE1D,UAAI,eAAe;AAEnB,UAAI,QAAQ;AACR,uBAAe,WAAW,KAAK,SAAS;AAAA,MAC5C;AAEA,YAAM,MAAM,SAAS,eAAe;AAEpC,aAAO,UAAU,MAAM;AAAA,IAC3B;AAAA,EACJ,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,KAAK;AAAA,IACd,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,KAAK,aAAa,UAAU,KAAK;AAE7C,aAAO,MAAM;AAAA;AAAA,MAAW;AAAA;AAAA,IAAc;AAAA,IAC1C;AAAA,EACJ,CAAC,EACA,QAAQ,SAAS;AAAA,IACd,QAAQ,CAAC,SAAsB;AAC3B,aACI,KAAK,QAAQ,YAAY,MAAM,YAC/B,KAAK,UAAU,SAAS,cAAc;AAAA,IAE9C;AAAA,IACA,YAAY,SAAS,OAAa;AAC9B,YAAM,OAAO;AAEb,YAAM,QAAQ;AAAA,QACV,KAAK,aAAa,YAAY,KAAK;AAAA,MACvC;AAEA,YAAM,IAAI,IAAI,IAAI,OAAO,GAAG;AAC5B,QAAE,OAAO,yBAAyB;AAElC,aAAO,QAAQ;AAAA;AAAA,OAAY,WAAW,EAAE;AAAA;AAAA,IAAe;AAAA,IAC3D;AAAA,EACJ,CAAC,EACA,QAAQ,cAAc;AAAA,IACnB,QAAQ;AAAA,IACR,aAAa,MAAM;AAAA,EACvB,CAAC,EACA,QAAQ,YAAY;AAAA,IACjB,QAAQ,CAAC,QAAQ;AAAA,IACjB,YAAY,SAAS,MAAW;AAC5B,YAAM,MAAM,gBAAgB,KAAK,SAAS;AAC1C,aAAO,OAAO;AAAA,IAClB;AAAA,EACJ,CAAC;AAEL,SAAO;AACX;;;AD9EA,eAAsB,gBAAgB,SAAiB,MAAuB;AAJ9E;AAKI,QAAM,IAAI,KAAK,OAAO;AAEtB,QAAM,SAAS,EAAE,gBAAgB,EAAE,KAAK,KAAK,IAAI,KAAK;AACtD,QAAM,SAAS,MAAM;AAAA,IACjB,IAAI;AAAA,MACA;AAAA,SACI,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK;AAAA,QAC/B,GAAG,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,IAAI;AAAA,MACtC,EACK,IAAI,CAAC,SAAU,OAAO,KAAK,KAAK,IAAI,EAAG,EACvC,OAAO,OAAO;AAAA,IACvB;AAAA,EACJ,EAAE,KAAK,IAAI;AAEX,QAAM,SAAS,EAAE,aAAa;AAC9B,QAAM,OAAO,OAAO,KAAK;AAEzB,MAAI,6BAAM,QAAQ;AACd,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO;AACX;;;AGvCA,SAAS,QAAAC,aAAY;AAIrB,eAAsB,iBAAiB,SAAiB,MAAuB;AAJ/E;AAKI,QAAM,IAAIC,MAAK,OAAO;AACtB,IAAE,QAAQ,EAAE,OAAO;AAEnB,QAAM,SAAS,EAAE,OAAO,EAAE,KAAK,KAAK,IAAI,KAAK;AAC7C,QAAM,YAAU,OAAE,qBAAqB,MAAvB,mBAA0B,KAAK,eAAc,IAAI,KAAK;AAEtE,QAAM,OAAO,EAAE,MAAM,EAAE,KAAK;AAC5B,MAAI,6BAAM,QAAQ;AACd,QAAI,MAAM,mBAAmB,IAAI,EAAE,SAAS,IAAI;AAEhD,UAAM,MAAM;AAAA;AAAA,kBAAyB;AAAA;AAAA,IAAiB;AAEtD,WAAO;AAAA,MACH,SAAS;AAAA,MACT;AAAA,MACA,MAAM;AAAA,QACF;AAAA,QACA;AAAA,QACA,SAAS;AAAA,MACb;AAAA,IACJ;AAAA,EACJ;AAEA,SAAO;AACX;;;ANtBA,IAAM,WAAW,CAAC,SAAiB;AAC/B,SAAO;AAAA,IACH;AAAA,IACA,SAAS;AAAA,IACT,KAAK,OAAO;AAAA,EAChB;AACJ;AAIO,SAAS,aAAa,MAAc;AACvC,SAAO,6BAAM,SAAS;AAC1B;AAEA,eAAsB,UAClB,MACA,MACuB;AACvB,MAAI,SAAgC;AAEpC,MAAI,aAAa,IAAI,GAAG;AACpB,aAAS,MAAM,gBAAgB,MAAM,IAAI;AAAA,EAC7C;AACA,MAAI,CAAC,QAAQ;AAET,aAAS,MAAM,iBAAiB,MAAM,IAAI;AAAA,EAC9C;AAEA,MAAI,QAAQ;AACR,WAAO;AAAA,EACX;AAEA,SAAO,uBAAoB;AAC/B;AAEA,eAAsB,uBAClB,MAIA,KACuB;AACvB,MAAI;AACA,WAAO,UAAU,MAAM,EAAE,IAAI,CAAC;AAAA,EAClC,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;AASA,eAAsB,sBAClB,KACA,UAAyC,CAAC,GACnB;AACvB,QAAM,EAAE,cAAc,CAAC,EAAE,IAAI;AAC7B,QAAM,EAAE,UAAU,CAAC,MAAM,WAAW,IAAI;AAExC,QAAM,IAAI,IAAI,IAAI,GAAG;AAGrB,IAAE,aAAa,OAAO,WAAW;AAEjC,MAAI;AACA,UAAM,MAAM,MAAM,MAAM,IAAI,EAAE,MAAM;AAAA,MAChC,SAAS;AAAA,MACT,cAAc;AAAA,MACd,SAAS;AAAA,QACL,KAAK;AAAA,QACL,6BAA6B;AAAA,QAC7B,cACI;AAAA,QACJ,GAAG;AAAA,MACP;AAAA,MACA,GAAG;AAAA,IACP,CAAC;AAED,WAAO,uBAAuB,IAAI,MAAM,GAAG;AAAA,EAC/C,SAAS,KAAP;AACE,YAAQ,IAAI,GAAG;AACf,WAAO,uBAAoB;AAAA,EAC/B;AACJ;","names":["Status","load","load"]}
|